import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
pd.options.display.float_format = '{:.2f}'.format
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from statistics import mean
import plotly.express as px
from sklearn.preprocessing import RobustScaler, QuantileTransformer
import datetime
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from category_encoders.one_hot import OneHotEncoder
import matplotlib.ticker as mtick
# Si se desea descargar el dataset directamente del servidor de kaggle, requiere user password
# od.download("https://www.kaggle.com/datasets/ahsan81/superstore-marketing-campaign-dataset")
Antes de realizar cualquier tipo de proceso de machine learning, es importante conocer los datos con los que se trabajaran
# Lectura de los datos
df = pd.read_csv("../data/superstore_data.csv")
base = df
# 2240 filas x 22 columnas
base.shape
(2240, 22)
# Distribucion de la variable objetivo
cuenta = base["Response"].value_counts()
# Estilo por default para graficos
sns.set_theme(style = "white", palette = ("Accent"))
# Grafico de distribucion de la variable objetivo
# plot data
fig, ax = plt.subplots(figsize=(5, 5))
d = sns.barplot(x = base.Response.value_counts().index, y = cuenta, ax = ax)
# annotate
ax.bar_label(ax.containers[0], label_type = 'edge')
# pad the spacing between the number and the edge of the figure
ax.margins(y = 0.1)
ax.set_title('Distribucion variable Response')
plt.show()
fig.savefig('../img/objetivo.png')
# En porcentajes
porcentajes = base.Response.value_counts(normalize = True) * 100
porcentajes
0 85.09 1 14.91 Name: Response, dtype: float64
# Descripcion basica de los datos a analizar
base.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | 2240.00 | NaN | NaN | NaN | 5592.16 | 3246.66 | 0.00 | 2828.25 | 5458.50 | 8427.75 | 11191.00 |
| Year_Birth | 2240.00 | NaN | NaN | NaN | 1968.81 | 11.98 | 1893.00 | 1959.00 | 1970.00 | 1977.00 | 1996.00 |
| Education | 2240 | 5 | Graduation | 1127 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 2240 | 8 | Married | 864 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income | 2216.00 | NaN | NaN | NaN | 52247.25 | 25173.08 | 1730.00 | 35303.00 | 51381.50 | 68522.00 | 666666.00 |
| Kidhome | 2240.00 | NaN | NaN | NaN | 0.44 | 0.54 | 0.00 | 0.00 | 0.00 | 1.00 | 2.00 |
| Teenhome | 2240.00 | NaN | NaN | NaN | 0.51 | 0.54 | 0.00 | 0.00 | 0.00 | 1.00 | 2.00 |
| Dt_Customer | 2240 | 663 | 8/31/2012 | 12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Recency | 2240.00 | NaN | NaN | NaN | 49.11 | 28.96 | 0.00 | 24.00 | 49.00 | 74.00 | 99.00 |
| MntWines | 2240.00 | NaN | NaN | NaN | 303.94 | 336.60 | 0.00 | 23.75 | 173.50 | 504.25 | 1493.00 |
| MntFruits | 2240.00 | NaN | NaN | NaN | 26.30 | 39.77 | 0.00 | 1.00 | 8.00 | 33.00 | 199.00 |
| MntMeatProducts | 2240.00 | NaN | NaN | NaN | 166.95 | 225.72 | 0.00 | 16.00 | 67.00 | 232.00 | 1725.00 |
| MntFishProducts | 2240.00 | NaN | NaN | NaN | 37.53 | 54.63 | 0.00 | 3.00 | 12.00 | 50.00 | 259.00 |
| MntSweetProducts | 2240.00 | NaN | NaN | NaN | 27.06 | 41.28 | 0.00 | 1.00 | 8.00 | 33.00 | 263.00 |
| MntGoldProds | 2240.00 | NaN | NaN | NaN | 44.02 | 52.17 | 0.00 | 9.00 | 24.00 | 56.00 | 362.00 |
| NumDealsPurchases | 2240.00 | NaN | NaN | NaN | 2.33 | 1.93 | 0.00 | 1.00 | 2.00 | 3.00 | 15.00 |
| NumWebPurchases | 2240.00 | NaN | NaN | NaN | 4.08 | 2.78 | 0.00 | 2.00 | 4.00 | 6.00 | 27.00 |
| NumCatalogPurchases | 2240.00 | NaN | NaN | NaN | 2.66 | 2.92 | 0.00 | 0.00 | 2.00 | 4.00 | 28.00 |
| NumStorePurchases | 2240.00 | NaN | NaN | NaN | 5.79 | 3.25 | 0.00 | 3.00 | 5.00 | 8.00 | 13.00 |
| NumWebVisitsMonth | 2240.00 | NaN | NaN | NaN | 5.32 | 2.43 | 0.00 | 3.00 | 6.00 | 7.00 | 20.00 |
| Response | 2240.00 | NaN | NaN | NaN | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Complain | 2240.00 | NaN | NaN | NaN | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
Algunas observaciones:
# Descripcion del tipo de dato de cada columna
base.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 Response 2240 non-null int64 21 Complain 2240 non-null int64 dtypes: float64(1), int64(18), object(3) memory usage: 385.1+ KB
# Duplicados
# No hay duplicados
print(base.duplicated().sum())
0
# Encontrar valores nulos
# Solo seria la columna income, valores en porcentaje
base.isnull().mean().round(4).mul(100).sort_values(ascending=False)
Income 1.07 Id 0.00 MntFishProducts 0.00 Response 0.00 NumWebVisitsMonth 0.00 NumStorePurchases 0.00 NumCatalogPurchases 0.00 NumWebPurchases 0.00 NumDealsPurchases 0.00 MntGoldProds 0.00 MntSweetProducts 0.00 MntMeatProducts 0.00 Year_Birth 0.00 MntFruits 0.00 MntWines 0.00 Recency 0.00 Dt_Customer 0.00 Teenhome 0.00 Kidhome 0.00 Marital_Status 0.00 Education 0.00 Complain 0.00 dtype: float64
# Histograma de las columnas de pandas
# base.hist(figsize=(13,13))
# plt.show()
# select the columns to be plotted
cols = base.select_dtypes(exclude = ['object'])
# create the figure and axes
fig, axes = plt.subplots(nrows = 4, ncols = 4, figsize = (20,8))
axes = axes.ravel() # flattening the array makes indexing easier
for col, ax in zip(cols, axes):
sns.histplot(data = base[col], kde = True, stat = 'density', ax = ax)
fig.tight_layout()
plt.show()
Hay variables que probablemente se deban transformar
Se estudian en particular
data = base.copy()
# Calculo anio
today = datetime.date.today()
year = today.year
# Se elimina la columna ID
data.drop(columns = ["Id"], inplace = True)
# La columna Dt_Customer pasa a contar solo con el anio
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'])
# Se crea una columna edad que es la resta del anio actual con el de la columna
data["Edad"] = year - pd.to_datetime(data["Year_Birth"], format = "%Y").apply(lambda x: x.year)
data["Edad"].sort_values()
562 27
1824 27
697 28
1468 28
964 28
...
1740 82
2171 83
2233 123
827 124
513 130
Name: Edad, Length: 2240, dtype: int64
# La columna edad tiene valores "excesivos" son posibles outliers
data[data["Edad"]> 100]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 513 | 1893 | 2n Cycle | Single | 60182.00 | 0 | 1 | 2014-05-17 | 23 | 8 | 0 | ... | 0 | 2 | 1 | 1 | 0 | 2 | 4 | 0 | 0 | 130 |
| 827 | 1899 | PhD | Together | 83532.00 | 0 | 0 | 2013-09-26 | 36 | 755 | 144 | ... | 64 | 224 | 1 | 4 | 6 | 4 | 1 | 0 | 0 | 124 |
| 2233 | 1900 | 2n Cycle | Divorced | 36640.00 | 1 | 0 | 2013-09-26 | 99 | 15 | 6 | ... | 4 | 25 | 1 | 2 | 1 | 2 | 5 | 0 | 1 | 123 |
3 rows × 22 columns
# Es mejor eliminarlos
data.drop(data[data["Edad"] > 100].index, inplace=True)
# La columna Dt_Customer puede entregar algunas columnas adicionales interesantes
# Anio de registro
data["Dt_Customer_year"] = data["Dt_Customer"].apply(lambda x: x.year)
# Semestre
data["Dt_Customer_q"] = data["Dt_Customer"].apply(lambda x: x.quarter)
# Mes
data["Dt_Customer_m"] = data["Dt_Customer"].apply(lambda x: x.month)
# Se calcula el tiempo de participacion
data['tiempo_participacion'] = year - data.Dt_Customer_year
# Columna Income se imputa por la media los valores nulos
data = data.fillna(data.mean(numeric_only = True))
# Check
data.isnull().mean().round(4).mul(100).sort_values(ascending = False)
Year_Birth 0.00 Education 0.00 Dt_Customer_m 0.00 Dt_Customer_q 0.00 Dt_Customer_year 0.00 Edad 0.00 Complain 0.00 Response 0.00 NumWebVisitsMonth 0.00 NumStorePurchases 0.00 NumCatalogPurchases 0.00 NumWebPurchases 0.00 NumDealsPurchases 0.00 MntGoldProds 0.00 MntSweetProducts 0.00 MntFishProducts 0.00 MntMeatProducts 0.00 MntFruits 0.00 MntWines 0.00 Recency 0.00 Dt_Customer 0.00 Teenhome 0.00 Kidhome 0.00 Income 0.00 Marital_Status 0.00 tiempo_participacion 0.00 dtype: float64
# Se agrega una nueva columna para saber el total gastado en compras por cliente
col_list = ["MntFishProducts","MntMeatProducts","MntFruits", "MntSweetProducts", "MntWines", "MntGoldProds"]
data['Total_Compras'] = data[col_list].sum(axis = 1)
Tratamiento de categoricas, recategorizar levels etc
"Education", Se debe recategorizar
"Marital_Status", Se debe recategorizar
"Kidhome", niveles 0,1,2
"Teenhome",mismo anterior
"Complain",
"Dt_Customer_year"
"Dt_Customer_q"
"Dt_Customer_q"
data.Dt_Customer_year.value_counts()
2013 1187 2014 556 2012 494 Name: Dt_Customer_year, dtype: int64
data.Education.value_counts()
Graduation 1127 PhD 485 Master 370 2n Cycle 201 Basic 54 Name: Education, dtype: int64
data['Education'] = data['Education'].replace('2n Cycle', 'Master')
data.Education.value_counts()
Graduation 1127 Master 571 PhD 485 Basic 54 Name: Education, dtype: int64
data.Marital_Status.value_counts()
Married 864 Together 579 Single 479 Divorced 231 Widow 77 Alone 3 YOLO 2 Absurd 2 Name: Marital_Status, dtype: int64
# Se recategorizan las categorias escasamente representadas
data.Marital_Status = data.Marital_Status.replace(['Alone', 'YOLO', 'Absurd', 'Widow'], 'Single')
data.Marital_Status = data.Marital_Status.replace('Together', 'Married')
data.Marital_Status.value_counts()
Married 1443 Single 563 Divorced 231 Name: Marital_Status, dtype: int64
# Revision posibles categoricas
categoricas = data[['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'Complain','Dt_Customer_year']]
for feature in categoricas:
print(f'{feature}: {data[feature].unique()}')
Education: ['Graduation' 'PhD' 'Master' 'Basic'] Marital_Status: ['Divorced' 'Single' 'Married'] Kidhome: [0 1 2] Teenhome: [0 1 2] Complain: [0 1] Dt_Customer_year: [2014 2013 2012]
# Analisis nivel educativo
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Education')
ax.set_title('Nivel educativo')
plt.show()
fig.savefig('../img/educacion.png')
print (f'Un cliente con nivel educativo graduation tiene una probabilidad de un {round(data[data["Education"] == "Graduation"]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con nivel educativo Phd tiene una probabilidad de un {round(data[data["Education"] == "PhD"]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con nivel educativo Master tiene una probabilidad de un {round(data[data["Education"] == "Master"]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con nivel educativo Basic tiene una probabilidad de un {round(data[data["Education"] == "Basic"]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente con nivel educativo graduation tiene una probabilidad de un 13.49 % de tomar la oferta Un cliente con nivel educativo Phd tiene una probabilidad de un 20.82 % de tomar la oferta Un cliente con nivel educativo Master tiene una probabilidad de un 13.84 % de tomar la oferta Un cliente con nivel educativo Basic tiene una probabilidad de un 3.7 % de tomar la oferta
# Analisis estado civil
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Marital_Status')
ax.set_title('Estado civil')
plt.show()
fig.savefig('../img/civil.png')
print (f'Un cliente con estado civil Casado tiene una probabilidad de un {round(data[data["Marital_Status"] == "Married"]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con estado civil Soltero tiene una probabilidad de un {round(data[data["Marital_Status"] == "Single"]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con estado civil Divorciado tiene una probabilidad de un {round(data[data["Marital_Status"] == "Divorced"]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente con estado civil Casado tiene una probabilidad de un 10.95 % de tomar la oferta Un cliente con estado civil Soltero tiene una probabilidad de un 22.74 % de tomar la oferta Un cliente con estado civil Divorciado tiene una probabilidad de un 20.78 % de tomar la oferta
# Analisis kidhome
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Kidhome')
ax.set_title('Niños en casa')
plt.show()
fig.savefig('../img/kids.png')
print (f'Un cliente sin niños tiene una probabilidad de un {round(data[data["Kidhome"] == 0]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con 1 niño en casa una probabilidad de un {round(data[data["Kidhome"] == 1]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con 2 niños en casa una probabilidad de un {round(data[data["Kidhome"] == 2]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente sin niños tiene una probabilidad de un 17.2 % de tomar la oferta Un cliente con 1 niño en casa una probabilidad de un 12.25 % de tomar la oferta Un cliente con 2 niños en casa una probabilidad de un 4.17 % de tomar la oferta
# Analisis teenhome
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x = 'Teenhome')
ax.set_title('Adolescentes en casa')
plt.show()
fig.savefig('../img/teen.png')
print (f'Un cliente sin adolescentes en casa tiene una probabilidad de un {round(data[data["Teenhome"] == 0]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con 1 adolescentes en casa tiene una probabilidad de un {round(data[data["Teenhome"] == 1]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con 2 adolescentes en casa una probabilidad de un {round(data[data["Kidhome"] == 2]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente sin adolescentes en casa tiene una probabilidad de un 20.5 % de tomar la oferta Un cliente con 1 adolescentes en casa tiene una probabilidad de un 8.94 % de tomar la oferta Un cliente con 2 adolescentes en casa una probabilidad de un 4.17 % de tomar la oferta
# Analisis reclamos
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Complain')
ax.set_title('Reclamos')
plt.show()
fig.savefig('../img/reclamos.png')
print (f'Un cliente sin reclamos tiene una probabilidad de {round(data[data["Complain"] == 0]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente con algun reclamo tiene una probabilidad de {round(data[data["Complain"] == 1]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente sin reclamos tiene una probabilidad de 14.93 % de tomar la oferta Un cliente con algun reclamo tiene una probabilidad de 15.0 % de tomar la oferta
# Analisis anio registro
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Dt_Customer_year')
ax.set_title('Año registro')
plt.show()
fig.savefig('../img/aregistro.png')
print (f'Un cliente registrado el 2012 tiene una probabilidad de {round(data[data["Dt_Customer_year"] == 2012]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente registrado el 2013 tiene una probabilidad de {round(data[data["Dt_Customer_year"] == 2013]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Un cliente registrado el 2013 tiene una probabilidad de {round(data[data["Dt_Customer_year"] == 2014]["Response"].mean()*100,2)} % de tomar la oferta')
Un cliente registrado el 2012 tiene una probabilidad de 26.72 % de tomar la oferta Un cliente registrado el 2013 tiene una probabilidad de 12.97 % de tomar la oferta Un cliente registrado el 2013 tiene una probabilidad de 8.63 % de tomar la oferta
# Analisis mes
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Dt_Customer_m')
ax.set_title('Mes registro')
plt.show()
fig.savefig('../img/mregistro.png')
# Analisis semestre
fig, ax = plt.subplots(figsize = (5, 5))
sns.countplot(data = data, x ='Dt_Customer_q')
ax.set_title('Trimestre registro')
fig.savefig('../img/sregistro.png')
plt.show()
# Boxplot para saber niveles de respuesta a oferta
columnas = data[
[
"MntWines",
"MntGoldProds",
"MntMeatProducts",
"MntFruits",
"MntFishProducts",
"MntSweetProducts",
]
].columns.tolist()
plt.figure(figsize = (8, 8))
for i, variable in enumerate(columnas):
plt.subplot(3, 3, i + 1)
sns.boxplot(data["Response"], data[variable])
plt.tight_layout()
plt.title(variable)
plt.savefig('../img/compras_{}.png'.format(i), format = "PNG") # Guarda en formato png
plt.show()
A mayor monto que se ha gastado en los items descritos en los plots de arriba, aumenta el nivel de respuesta positiva a la campana
# Boxplot para saber niveles de respuesta a oferta
columnas = data[
[
"NumStorePurchases",
"NumCatalogPurchases",
"NumWebPurchases",
"NumDealsPurchases",
]
].columns.tolist()
plt.figure(figsize=(10, 10))
for i, variable in enumerate(columnas):
plt.subplot(3, 2, i + 1)
sns.boxplot(data["Response"], data[variable])
plt.tight_layout()
plt.title(variable)
plt.savefig('../img/lugar_{}.png'.format(i), format = "PNG") # Guarda en formato png
plt.show()
# Boxplot para saber niveles de respuesta a oferta
columnas = data[
[
"Education",
"Marital_Status",
"Kidhome",
"Teenhome"
]
].columns.tolist()
plt.figure(figsize=(8, 8))
for i, variable in enumerate(columnas):
plt.subplot(3, 2, i + 1)
sns.boxplot(data[variable], data["Total_Compras"], palette='pastel', orient = "vertical")
plt.tight_layout()
plt.title(variable)
plt.savefig('../img/total_{}.png'.format(i), format="PNG") # Guarda en formato png
plt.show()
La mayor cantidad de dinero gastado en compras es para las personas con nivel educativo alto, solteros, sin niños ni adolescentes pequeños
# Total compras vs edad
plt.figure(figsize=(12, 5))
sns.lineplot(y = data.Total_Compras, x = data.Edad, color='#028ca1')
plt.show()
fig.savefig('../img/temp.png')
# Boxplot para saber niveles de respuesta a oferta
columnas = data[
[
"NumWebVisitsMonth",
"Recency"
]
].columns.tolist()
# plt.figure(figsize=(15, 10))
for i, variable in enumerate(columnas):
plt.subplot(1, 2, i + 1)
sns.boxplot(data["Response"], data[variable])
plt.tight_layout()
plt.title(variable)
plt.savefig('../img/visitas_{}.png'.format(i), format = "PNG") # Guarda en formato png
plt.show()
print (f'Visitas a la web {round(data[data["NumWebVisitsMonth"] == 0]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Visitas a la web {round(data[data["NumWebVisitsMonth"] == 1]["Response"].mean()*100,2)} % de tomar la oferta')
Visitas a la web 0.0 % de tomar la oferta Visitas a la web 19.74 % de tomar la oferta
print (f'Recency {round(data[data["Recency"] == 0]["Response"].mean()*100,2)} % de tomar la oferta')
print (f'Recency {round(data[data["Recency"] == 1]["Response"].mean()*100,2)} % de tomar la oferta')
Recency 25.0 % de tomar la oferta Recency 37.5 % de tomar la oferta
# Boxplot para saber niveles de respuesta a oferta
columnas = data[
[
"Edad",
"tiempo_participacion",
"Income",
"Total_Compras"
]
].columns.tolist()
plt.figure(figsize=(10, 10))
for i, variable in enumerate(columnas):
plt.subplot(3, 2, i + 1)
sns.boxplot(data["Response"], data[variable], palette='Accent')
plt.tight_layout()
plt.title(variable)
plt.savefig('../img/varios_{}.png'.format(i), format = "PNG") # Guarda en formato png
plt.show()
# Usar si se quiere generar boxplots para todas las variables numericas
# cols = data.select_dtypes(include=np.number)
# #create the figure and axes
# fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(15,15))
# axes = axes.ravel() # flattening the array makes indexing easier
# for col, ax in zip(cols, axes):
# sns.boxplot(x = data[col], data=data, orient='v', ax=ax, color = "pink")
# plt.subplots_adjust(wspace=0.5)
# ax.set_ylabel('')
# fig.tight_layout()
# plt.axis('off')
# plt.show()
Se realiza un analisis a las features numericas que tambien presentan numeros fuera de rango u outliers Como es un dataset pequeno se puede realizar de 1 a 1 pero en casos de mayot cantidad de columnas se deben crear metodos mas eficientes de quitar los valores fuera de rango
cols = data.select_dtypes([np.number]).columns
print (cols)
Index(['Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines',
'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'Response', 'Complain', 'Edad', 'Dt_Customer_year', 'Dt_Customer_q',
'Dt_Customer_m', 'tiempo_participacion', 'Total_Compras'],
dtype='object')
data.shape
(2237, 27)
px.box(data_frame = data, x = 'Income')
# Income tiene datos que se escapan de la media
data[data["Income"] > 200000]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 527 | 1977 | Graduation | Married | 666666.00 | 1 | 0 | 2013-02-06 | 23 | 9 | 14 | ... | 3 | 6 | 0 | 0 | 46 | 2013 | 1 | 2 | 10 | 62 |
1 rows × 27 columns
# Se elimina e es solo una columna
data.drop(index=data[data.Income > 300000].index, inplace=True)
px.box(data_frame = data, x = 'MntMeatProducts')
data[data.MntMeatProducts > 1200]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 325 | 1977 | Graduation | Married | 157146.00 | 0 | 0 | 2013-04-29 | 13 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 46 | 2013 | 2 | 4 | 10 | 1730 |
| 497 | 1982 | PhD | Married | 160803.00 | 0 | 0 | 2012-04-08 | 21 | 55 | 16 | ... | 1 | 0 | 0 | 0 | 41 | 2012 | 2 | 4 | 11 | 1717 |
| 961 | 1979 | Graduation | Married | 2447.00 | 1 | 0 | 2013-06-01 | 42 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 44 | 2013 | 2 | 6 | 10 | 1730 |
| 1213 | 1978 | Master | Married | 52236.58 | 0 | 0 | 2012-12-08 | 53 | 32 | 2 | ... | 1 | 0 | 0 | 0 | 45 | 2012 | 4 | 12 | 11 | 1679 |
| 2204 | 1973 | PhD | Married | 157243.00 | 0 | 1 | 2014-01-03 | 98 | 20 | 2 | ... | 0 | 0 | 0 | 0 | 50 | 2014 | 1 | 1 | 9 | 1608 |
5 rows × 27 columns
data.MntMeatProducts.nlargest(10)
325 1725 961 1725 497 1622 1213 1607 2204 1582 1921 984 53 981 994 974 2021 968 1338 961 Name: MntMeatProducts, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data["MntMeatProducts"].clip(upper=984, inplace=True)
px.box(data_frame = data, x = 'MntSweetProducts')
data[data.MntSweetProducts > 200]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 210 | 1945 | PhD | Single | 113734.00 | 0 | 0 | 2014-05-28 | 9 | 6 | 2 | ... | 0 | 1 | 0 | 0 | 78 | 2014 | 2 | 5 | 9 | 277 |
| 449 | 1986 | Graduation | Single | 52236.58 | 1 | 0 | 2013-02-20 | 19 | 5 | 1 | ... | 0 | 1 | 0 | 0 | 37 | 2013 | 1 | 2 | 10 | 637 |
2 rows × 27 columns
data.MntSweetProducts.nlargest(10)
449 263 210 262 305 198 31 197 1502 196 2067 195 994 194 1143 194 1785 194 1055 192 Name: MntSweetProducts, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data.MntSweetProducts.clip(upper=198, inplace=True)
px.box(data_frame = data, x = 'MntGoldProds')
data[data.MntGoldProds > 250]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14 | 1969 | Graduation | Married | 4428.00 | 0 | 1 | 2013-05-10 | 0 | 16 | 4 | ... | 0 | 1 | 0 | 0 | 54 | 2013 | 2 | 5 | 10 | 359 |
| 60 | 1982 | Master | Single | 6560.00 | 0 | 0 | 2013-12-12 | 2 | 67 | 11 | ... | 1 | 17 | 0 | 0 | 41 | 2013 | 4 | 12 | 10 | 373 |
| 449 | 1986 | Graduation | Single | 52236.58 | 1 | 0 | 2013-02-20 | 19 | 5 | 1 | ... | 0 | 1 | 0 | 0 | 37 | 2013 | 1 | 2 | 10 | 637 |
| 2063 | 1966 | PhD | Single | 7144.00 | 0 | 2 | 2013-07-12 | 92 | 81 | 4 | ... | 1 | 0 | 0 | 0 | 57 | 2013 | 3 | 7 | 10 | 416 |
4 rows × 27 columns
data.MntGoldProds.nlargest()
449 362 14 321 2063 291 60 262 1789 249 Name: MntGoldProds, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data.MntGoldProds.clip(upper=249, inplace=True)
px.box(data_frame = data, x = 'NumDealsPurchases')
data[data.NumDealsPurchases > 5]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 26 | 1956 | Graduation | Married | 54450.00 | 1 | 1 | 2012-09-14 | 0 | 454 | 0 | ... | 8 | 8 | 0 | 0 | 67 | 2012 | 3 | 9 | 11 | 684 |
| 27 | 1956 | Graduation | Married | 54450.00 | 1 | 1 | 2012-09-14 | 0 | 454 | 0 | ... | 8 | 8 | 0 | 0 | 67 | 2012 | 3 | 9 | 11 | 684 |
| 64 | 1974 | Master | Divorced | 53367.00 | 1 | 1 | 2013-08-31 | 2 | 229 | 7 | ... | 8 | 7 | 1 | 0 | 49 | 2013 | 3 | 8 | 10 | 400 |
| 100 | 1948 | PhD | Single | 60200.00 | 0 | 1 | 2013-02-01 | 3 | 502 | 19 | ... | 11 | 6 | 0 | 0 | 75 | 2013 | 1 | 2 | 10 | 685 |
| 105 | 1949 | Master | Married | 62845.00 | 1 | 1 | 2012-01-10 | 3 | 1099 | 0 | ... | 10 | 8 | 0 | 0 | 74 | 2012 | 1 | 1 | 11 | 1178 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2204 | 1973 | PhD | Married | 157243.00 | 0 | 1 | 2014-01-03 | 98 | 20 | 2 | ... | 0 | 0 | 0 | 0 | 50 | 2014 | 1 | 1 | 9 | 1608 |
| 2210 | 1966 | Graduation | Married | 44529.00 | 0 | 1 | 2013-05-07 | 98 | 538 | 13 | ... | 11 | 5 | 0 | 0 | 57 | 2013 | 2 | 5 | 10 | 691 |
| 2216 | 1960 | PhD | Divorced | 50611.00 | 0 | 1 | 2012-04-10 | 98 | 459 | 0 | ... | 7 | 6 | 1 | 0 | 63 | 2012 | 2 | 4 | 11 | 493 |
| 2217 | 1960 | PhD | Divorced | 50611.00 | 0 | 1 | 2012-04-10 | 98 | 459 | 0 | ... | 7 | 6 | 0 | 0 | 63 | 2012 | 2 | 4 | 11 | 493 |
| 2218 | 1970 | Graduation | Single | 83273.00 | 1 | 2 | 2012-09-25 | 98 | 433 | 89 | ... | 9 | 7 | 0 | 0 | 53 | 2012 | 3 | 9 | 11 | 1392 |
147 rows × 27 columns
data.NumDealsPurchases.nlargest(8)
497 15 663 15 961 15 1416 15 1466 15 1718 15 2204 15 1640 13 Name: NumDealsPurchases, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data.NumDealsPurchases.clip(upper=13, inplace=True)
px.box(data_frame = data, x = 'NumWebPurchases')
data[data.NumWebPurchases > 20]
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | Total_Compras | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14 | 1969 | Graduation | Married | 4428.00 | 0 | 1 | 2013-05-10 | 0 | 16 | 4 | ... | 0 | 1 | 0 | 0 | 54 | 2013 | 2 | 5 | 10 | 359 |
| 210 | 1945 | PhD | Single | 113734.00 | 0 | 0 | 2014-05-28 | 9 | 6 | 2 | ... | 0 | 1 | 0 | 0 | 78 | 2014 | 2 | 5 | 9 | 277 |
| 449 | 1986 | Graduation | Single | 52236.58 | 1 | 0 | 2013-02-20 | 19 | 5 | 1 | ... | 0 | 1 | 0 | 0 | 37 | 2013 | 1 | 2 | 10 | 637 |
| 2063 | 1966 | PhD | Single | 7144.00 | 0 | 2 | 2013-07-12 | 92 | 81 | 4 | ... | 1 | 0 | 0 | 0 | 57 | 2013 | 3 | 7 | 10 | 416 |
4 rows × 27 columns
data.NumWebPurchases.nlargest()
210 27 449 27 14 25 2063 23 41 11 Name: NumWebPurchases, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data.NumWebPurchases.clip(upper=11, inplace=True)
px.box(data_frame = data, x = 'NumCatalogPurchases')
data.NumCatalogPurchases.nlargest()
325 28 497 28 961 28 2204 22 292 11 Name: NumCatalogPurchases, dtype: int64
# Valores fuera de rango se dejan al ultimo valor mas alto
data.NumCatalogPurchases.clip(upper=11, inplace=True)
# Aumentar el espacio para el plot
plt.figure(figsize = (20, 10))
# Guardar las configuraciones del plot
heatmap = sns.heatmap(data.corr(), vmin =- 1, vmax = 1, annot = True)
# Titulos y otros settings
heatmap.set_title('Correlation Heatmap', fontdict = {'fontsize':12}, pad = 12);
Hay correlaciones fuertes entre:
Para mantener independencia se eliminan
# Como se han transformado columnas para generar nuevas, se eliminan las originales y total compras
data1 = data.copy()
data1.drop(columns=["Year_Birth", "Dt_Customer", "Total_Compras"],inplace=True)
temp = data1.copy()
temp.columns
Index(['Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
'NumStorePurchases', 'NumWebVisitsMonth', 'Response', 'Complain',
'Edad', 'Dt_Customer_year', 'Dt_Customer_q', 'Dt_Customer_m',
'tiempo_participacion'],
dtype='object')
One hot encoder de sklearn y Label Enconder
# one-hot-encode using sklearn, crea columnas adicionales y borra las originales
one_hot = OneHotEncoder(use_cat_names=True)
limpio = one_hot.fit_transform(temp)
limpio.head(5)
| Education_Graduation | Education_PhD | Education_Master | Education_Basic | Marital_Status_Divorced | Marital_Status_Single | Marital_Status_Married | Income | Kidhome | Teenhome | ... | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | Response | Complain | Edad | Dt_Customer_year | Dt_Customer_q | Dt_Customer_m | tiempo_participacion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 84835.00 | 0 | 0 | ... | 4 | 6 | 1 | 1 | 0 | 53 | 2014 | 2 | 6 | 9 |
| 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 57091.00 | 0 | 0 | ... | 3 | 7 | 5 | 1 | 0 | 62 | 2014 | 2 | 6 | 9 |
| 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 67267.00 | 0 | 1 | ... | 2 | 5 | 2 | 0 | 0 | 65 | 2014 | 2 | 5 | 9 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 32474.00 | 1 | 1 | ... | 0 | 2 | 7 | 0 | 0 | 56 | 2014 | 4 | 11 | 9 |
| 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 21474.00 | 1 | 0 | ... | 1 | 2 | 7 | 1 | 0 | 34 | 2014 | 3 | 8 | 9 |
5 rows × 29 columns
# Guarda el dataset limpio para ser usado en autoML, modelamiento de algoritmos y otros
limpio.to_csv("../data/superstore_clean.csv", encoding='utf-8', index=False)